{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "# !pip3 install bert-tensorflow"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Download BERT-Base model"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "# !wget https://storage.googleapis.com/bert_models/2018_10_18/cased_L-12_H-768_A-12.zip\n",
    "# !unzip cased_L-12_H-768_A-12.zip"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Download simple dataset\n",
    "\n",
    "I want to use negative sentiment corpus to build unsupervised topic models using Attention from BERT."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "5330"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# !wget https://raw.githubusercontent.com/huseinzol05/NLP-Models-Tensorflow/master/text-classification/data/negative/negative\n",
    "\n",
    "with open('negative') as fopen:\n",
    "    negative = fopen.read().split('\\n')[:-1]\n",
    "len(negative)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [],
   "source": [
    "import bert\n",
    "from bert import run_classifier\n",
    "from bert import optimization\n",
    "from bert import tokenization\n",
    "from bert import modeling\n",
    "import tensorflow as tf\n",
    "import numpy as np\n",
    "import itertools"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "BERT_VOCAB = 'cased_L-12_H-768_A-12/vocab.txt'\n",
    "BERT_INIT_CHKPNT = 'cased_L-12_H-768_A-12/bert_model.ckpt'\n",
    "BERT_CONFIG = 'cased_L-12_H-768_A-12/bert_config.json'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [],
   "source": [
    "def generate_ngram(seq, ngram = (1, 3)):\n",
    "    g = []\n",
    "    for i in range(ngram[0], ngram[-1] + 1):\n",
    "        g.extend(list(ngrams_generator(seq, i)))\n",
    "    return g\n",
    "\n",
    "def _pad_sequence(\n",
    "    sequence,\n",
    "    n,\n",
    "    pad_left = False,\n",
    "    pad_right = False,\n",
    "    left_pad_symbol = None,\n",
    "    right_pad_symbol = None,\n",
    "):\n",
    "    sequence = iter(sequence)\n",
    "    if pad_left:\n",
    "        sequence = itertools.chain((left_pad_symbol,) * (n - 1), sequence)\n",
    "    if pad_right:\n",
    "        sequence = itertools.chain(sequence, (right_pad_symbol,) * (n - 1))\n",
    "    return sequence\n",
    "\n",
    "\n",
    "def ngrams_generator(\n",
    "    sequence,\n",
    "    n,\n",
    "    pad_left = False,\n",
    "    pad_right = False,\n",
    "    left_pad_symbol = None,\n",
    "    right_pad_symbol = None,\n",
    "):\n",
    "    \"\"\"\n",
    "    generate ngrams.\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    sequence : list of str\n",
    "        list of tokenize words.\n",
    "    n : int\n",
    "        ngram size\n",
    "\n",
    "    Returns\n",
    "    -------\n",
    "    ngram: list\n",
    "    \"\"\"\n",
    "    sequence = _pad_sequence(\n",
    "        sequence, n, pad_left, pad_right, left_pad_symbol, right_pad_symbol\n",
    "    )\n",
    "\n",
    "    history = []\n",
    "    while n > 1:\n",
    "        try:\n",
    "            next_item = next(sequence)\n",
    "        except StopIteration:\n",
    "            return\n",
    "        history.append(next_item)\n",
    "        n -= 1\n",
    "    for item in sequence:\n",
    "        history.append(item)\n",
    "        yield tuple(history)\n",
    "        del history[0]\n",
    "\n",
    "def merge_wordpiece_tokens(paired_tokens, weighted = True):\n",
    "    new_paired_tokens = []\n",
    "    n_tokens = len(paired_tokens)\n",
    "\n",
    "    i = 0\n",
    "\n",
    "    while i < n_tokens:\n",
    "        current_token, current_weight = paired_tokens[i]\n",
    "        if current_token.startswith('##'):\n",
    "            previous_token, previous_weight = new_paired_tokens.pop()\n",
    "            merged_token = previous_token\n",
    "            merged_weight = [previous_weight]\n",
    "            while current_token.startswith('##'):\n",
    "                merged_token = merged_token + current_token.replace('##', '')\n",
    "                merged_weight.append(current_weight)\n",
    "                i = i + 1\n",
    "                current_token, current_weight = paired_tokens[i]\n",
    "            merged_weight = np.mean(merged_weight)\n",
    "            new_paired_tokens.append((merged_token, merged_weight))\n",
    "\n",
    "        else:\n",
    "            new_paired_tokens.append((current_token, current_weight))\n",
    "            i = i + 1\n",
    "\n",
    "    words = [\n",
    "        i[0]\n",
    "        for i in new_paired_tokens\n",
    "        if i[0] not in ['[CLS]', '[SEP]', '[PAD]']\n",
    "    ]\n",
    "    weights = [\n",
    "        i[1]\n",
    "        for i in new_paired_tokens\n",
    "        if i[0] not in ['[CLS]', '[SEP]', '[PAD]']\n",
    "    ]\n",
    "    if weighted:\n",
    "        weights = np.array(weights)\n",
    "        weights = weights / np.sum(weights)\n",
    "    return list(zip(words, weights))\n",
    "\n",
    "def _extract_attention_weights(num_layers, tf_graph):\n",
    "    attns = [\n",
    "        {\n",
    "            'layer_%s'\n",
    "            % i: tf_graph.get_tensor_by_name(\n",
    "                'bert/encoder/layer_%s/attention/self/Softmax:0' % i\n",
    "            )\n",
    "        }\n",
    "        for i in range(num_layers)\n",
    "    ]\n",
    "\n",
    "    return attns\n",
    "\n",
    "def padding_sequence(seq, maxlen, padding = 'post', pad_int = 0):\n",
    "    padded_seqs = []\n",
    "    for s in seq:\n",
    "        if padding == 'post':\n",
    "            padded_seqs.append(s + [pad_int] * (maxlen - len(s)))\n",
    "        if padding == 'pre':\n",
    "            padded_seqs.append([pad_int] * (maxlen - len(s)) + s)\n",
    "    return padded_seqs\n",
    "\n",
    "\n",
    "def bert_tokenization(tokenizer, texts, cls = '[CLS]', sep = '[SEP]'):\n",
    "\n",
    "    input_ids, input_masks, segment_ids, s_tokens = [], [], [], []\n",
    "    for text in texts:\n",
    "        tokens_a = tokenizer.tokenize(text)\n",
    "        tokens = [cls] + tokens_a + [sep]\n",
    "        segment_id = [0] * len(tokens)\n",
    "        input_id = tokenizer.convert_tokens_to_ids(tokens)\n",
    "        input_mask = [1] * len(input_id)\n",
    "\n",
    "        input_ids.append(input_id)\n",
    "        input_masks.append(input_mask)\n",
    "        segment_ids.append(segment_id)\n",
    "        s_tokens.append(tokens)\n",
    "\n",
    "    maxlen = max([len(i) for i in input_ids])\n",
    "    input_ids = padding_sequence(input_ids, maxlen)\n",
    "    input_masks = padding_sequence(input_masks, maxlen)\n",
    "    segment_ids = padding_sequence(segment_ids, maxlen)\n",
    "\n",
    "    return input_ids, input_masks, segment_ids, s_tokens\n",
    "\n",
    "class _Model:\n",
    "    def __init__(self, bert_config, tokenizer):\n",
    "        _graph = tf.Graph()\n",
    "        with _graph.as_default():\n",
    "            self.X = tf.placeholder(tf.int32, [None, None])\n",
    "            self._tokenizer = tokenizer\n",
    "\n",
    "            self.model = modeling.BertModel(\n",
    "                config = bert_config,\n",
    "                is_training = False,\n",
    "                input_ids = self.X,\n",
    "                use_one_hot_embeddings = False,\n",
    "            )\n",
    "            self.logits = self.model.get_pooled_output()\n",
    "            self._sess = tf.InteractiveSession()\n",
    "            self._sess.run(tf.global_variables_initializer())\n",
    "            var_lists = tf.get_collection(\n",
    "                tf.GraphKeys.TRAINABLE_VARIABLES, scope = 'bert'\n",
    "            )\n",
    "            self._saver = tf.train.Saver(var_list = var_lists)\n",
    "            attns = _extract_attention_weights(\n",
    "                bert_config.num_hidden_layers, tf.get_default_graph()\n",
    "            )\n",
    "            self.attns = attns\n",
    "\n",
    "    def vectorize(self, strings):\n",
    "\n",
    "        \"\"\"\n",
    "        Vectorize string inputs using bert attention.\n",
    "\n",
    "        Parameters\n",
    "        ----------\n",
    "        strings : str / list of str\n",
    "\n",
    "        Returns\n",
    "        -------\n",
    "        array: vectorized strings\n",
    "        \"\"\"\n",
    "\n",
    "        if isinstance(strings, list):\n",
    "            if not isinstance(strings[0], str):\n",
    "                raise ValueError('input must be a list of strings or a string')\n",
    "        else:\n",
    "            if not isinstance(strings, str):\n",
    "                raise ValueError('input must be a list of strings or a string')\n",
    "        if isinstance(strings, str):\n",
    "            strings = [strings]\n",
    "\n",
    "        batch_x, _, _, _ = bert_tokenization(self._tokenizer, strings)\n",
    "        return self._sess.run(self.logits, feed_dict = {self.X: batch_x})\n",
    "\n",
    "    def attention(self, strings, method = 'last', **kwargs):\n",
    "        \"\"\"\n",
    "        Get attention string inputs from bert attention.\n",
    "\n",
    "        Parameters\n",
    "        ----------\n",
    "        strings : str / list of str\n",
    "        method : str, optional (default='last')\n",
    "            Attention layer supported. Allowed values:\n",
    "\n",
    "            * ``'last'`` - attention from last layer.\n",
    "            * ``'first'`` - attention from first layer.\n",
    "            * ``'mean'`` - average attentions from all layers.\n",
    "\n",
    "        Returns\n",
    "        -------\n",
    "        array: attention\n",
    "        \"\"\"\n",
    "\n",
    "        if isinstance(strings, list):\n",
    "            if not isinstance(strings[0], str):\n",
    "                raise ValueError('input must be a list of strings or a string')\n",
    "        else:\n",
    "            if not isinstance(strings, str):\n",
    "                raise ValueError('input must be a list of strings or a string')\n",
    "        if isinstance(strings, str):\n",
    "            strings = [strings]\n",
    "\n",
    "        method = method.lower()\n",
    "        if method not in ['last', 'first', 'mean']:\n",
    "            raise Exception(\n",
    "                \"method not supported, only support 'last', 'first' and 'mean'\"\n",
    "            )\n",
    "\n",
    "        batch_x, _, _, s_tokens = bert_tokenization(self._tokenizer, strings)\n",
    "        maxlen = max([len(s) for s in s_tokens])\n",
    "        s_tokens = padding_sequence(s_tokens, maxlen, pad_int = '[SEP]')\n",
    "        attentions = self._sess.run(self.attns, feed_dict = {self.X: batch_x})\n",
    "        if method == 'first':\n",
    "            cls_attn = list(attentions[0].values())[0][:, :, 0, :]\n",
    "\n",
    "        if method == 'last':\n",
    "            cls_attn = list(attentions[-1].values())[0][:, :, 0, :]\n",
    "\n",
    "        if method == 'mean':\n",
    "            combined_attentions = []\n",
    "            for a in attentions:\n",
    "                combined_attentions.append(list(a.values())[0])\n",
    "            cls_attn = np.mean(combined_attentions, axis = 0).mean(axis = 2)\n",
    "\n",
    "        cls_attn = np.mean(cls_attn, axis = 1)\n",
    "        total_weights = np.sum(cls_attn, axis = -1, keepdims = True)\n",
    "        attn = cls_attn / total_weights\n",
    "        output = []\n",
    "        for i in range(attn.shape[0]):\n",
    "            output.append(\n",
    "                merge_wordpiece_tokens(list(zip(s_tokens[i], attn[i])))\n",
    "            )\n",
    "        return output"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "W0820 00:50:25.676800 139771824637760 deprecation_wrapper.py:119] From /home/husein/.local/lib/python3.6/site-packages/bert/tokenization.py:125: The name tf.gfile.GFile is deprecated. Please use tf.io.gfile.GFile instead.\n",
      "\n",
      "W0820 00:50:25.755635 139771824637760 deprecation_wrapper.py:119] From /home/husein/.local/lib/python3.6/site-packages/bert/modeling.py:171: The name tf.variable_scope is deprecated. Please use tf.compat.v1.variable_scope instead.\n",
      "\n",
      "W0820 00:50:25.757595 139771824637760 deprecation_wrapper.py:119] From /home/husein/.local/lib/python3.6/site-packages/bert/modeling.py:409: The name tf.get_variable is deprecated. Please use tf.compat.v1.get_variable instead.\n",
      "\n",
      "W0820 00:50:25.783736 139771824637760 deprecation_wrapper.py:119] From /home/husein/.local/lib/python3.6/site-packages/bert/modeling.py:490: The name tf.assert_less_equal is deprecated. Please use tf.compat.v1.assert_less_equal instead.\n",
      "\n",
      "W0820 00:50:26.212700 139771824637760 lazy_loader.py:50] \n",
      "The TensorFlow contrib module will not be included in TensorFlow 2.0.\n",
      "For more information, please see:\n",
      "  * https://github.com/tensorflow/community/blob/master/rfcs/20180907-contrib-sunset.md\n",
      "  * https://github.com/tensorflow/addons\n",
      "  * https://github.com/tensorflow/io (for I/O related ops)\n",
      "If you depend on functionality not listed there, please file an issue.\n",
      "\n",
      "W0820 00:50:26.247612 139771824637760 deprecation.py:323] From /home/husein/.local/lib/python3.6/site-packages/bert/modeling.py:671: dense (from tensorflow.python.layers.core) is deprecated and will be removed in a future version.\n",
      "Instructions for updating:\n",
      "Use keras.layers.dense instead.\n"
     ]
    }
   ],
   "source": [
    "tokenizer = tokenization.FullTokenizer(vocab_file=BERT_VOCAB, do_lower_case=False)\n",
    "bert_config = modeling.BertConfig.from_json_file(BERT_CONFIG)\n",
    "model = _Model(bert_config, tokenizer)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Test vectorization"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([[ 0.55213624, -0.33787724,  0.74862313, ..., -0.04363263,\n",
       "         0.31521446,  0.07524541],\n",
       "       [ 0.59046894, -0.304328  ,  0.7821516 , ..., -0.16189037,\n",
       "         0.367751  ,  0.07440313]], dtype=float32)"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "v = model.vectorize(['hello nice to meet u', 'so long sucker'])\n",
    "v"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(2, 768)"
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "v.shape"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Test attention"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[[('hello', 0.19323255),\n",
       "  ('nice', 0.19877374),\n",
       "  ('to', 0.19795448),\n",
       "  ('meet', 0.20197453),\n",
       "  ('u', 0.20806469)],\n",
       " [('so', 0.34224316), ('long', 0.31957355), ('sucker', 0.3381833)]]"
      ]
     },
     "execution_count": 12,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "model.attention(['hello nice to meet u', 'so long sucker'])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Building topic modeling"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {},
   "outputs": [],
   "source": [
    "batch_size = 10\n",
    "ngram = (1, 3)\n",
    "n_topics = 10"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████| 533/533 [01:11<00:00,  7.32it/s]\n"
     ]
    }
   ],
   "source": [
    "from sklearn.cluster import KMeans\n",
    "from tqdm import tqdm\n",
    "\n",
    "rows, attentions = [], []\n",
    "for i in tqdm(range(0, len(negative), batch_size)):\n",
    "    index = min(i + batch_size, len(negative))\n",
    "    rows.append(model.vectorize(negative[i:index]))\n",
    "    attentions.extend(model.attention(negative[i:index]))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Download simple english stopwords\n",
    "\n",
    "You might want to gather more of stopwords."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [],
   "source": [
    "# !wget https://raw.githubusercontent.com/stopwords-iso/stopwords-en/master/stopwords-en.json"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "1298"
      ]
     },
     "execution_count": 21,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "import json\n",
    "with open('stopwords-en.json') as fopen:\n",
    "    stopwords = json.load(fopen)\n",
    "len(stopwords)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[\"'ll\", \"'tis\", \"'twas\", \"'ve\", '10']"
      ]
     },
     "execution_count": 22,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "stopwords[:5]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "processed 500\n",
      "processed 1000\n",
      "processed 1500\n",
      "processed 2000\n",
      "processed 2500\n",
      "processed 3000\n",
      "processed 3500\n",
      "processed 4000\n",
      "processed 4500\n",
      "processed 5000\n"
     ]
    }
   ],
   "source": [
    "concat = np.concatenate(rows, axis = 0)\n",
    "kmeans = KMeans(n_clusters = n_topics, random_state = 0).fit(concat)\n",
    "labels = kmeans.labels_\n",
    "\n",
    "overall, filtered_a = [], []\n",
    "for a in attentions:\n",
    "    f = [i for i in a if i[0] not in stopwords]\n",
    "    overall.extend(f)\n",
    "    filtered_a.append(f)\n",
    "\n",
    "o_ngram = generate_ngram(overall, ngram)\n",
    "features = []\n",
    "for i in o_ngram:\n",
    "    features.append(' '.join([w[0] for w in i]))\n",
    "features = list(set(features))\n",
    "\n",
    "components = np.zeros((n_topics, len(features)))\n",
    "for no, i in enumerate(labels):\n",
    "    if (no + 1) % 500 == 0:\n",
    "        print('processed %d'%(no + 1))\n",
    "    f = generate_ngram(filtered_a[no], ngram)\n",
    "    for w in f:\n",
    "        word = ' '.join([r[0] for r in w])\n",
    "        score = np.mean([r[1] for r in w])\n",
    "        if word in features:\n",
    "            components[i, features.index(word)] += score"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "metadata": {},
   "outputs": [],
   "source": [
    "def print_topics_modelling(\n",
    "    topics, feature_names, sorting, n_words = 20, return_df = True\n",
    "):\n",
    "    if return_df:\n",
    "        try:\n",
    "            import pandas as pd\n",
    "        except:\n",
    "            raise Exception(\n",
    "                'pandas not installed. Please install it and try again or set `return_df = False`'\n",
    "            )\n",
    "    df = {}\n",
    "    for i in range(topics):\n",
    "        words = []\n",
    "        for k in range(n_words):\n",
    "            words.append(feature_names[sorting[i, k]])\n",
    "        df['topic %d' % (i)] = words\n",
    "    if return_df:\n",
    "        return pd.DataFrame.from_dict(df)\n",
    "    else:\n",
    "        return df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>topic 0</th>\n",
       "      <th>topic 1</th>\n",
       "      <th>topic 2</th>\n",
       "      <th>topic 3</th>\n",
       "      <th>topic 4</th>\n",
       "      <th>topic 5</th>\n",
       "      <th>topic 6</th>\n",
       "      <th>topic 7</th>\n",
       "      <th>topic 8</th>\n",
       "      <th>topic 9</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>movie</td>\n",
       "      <td>movie</td>\n",
       "      <td>movie</td>\n",
       "      <td>movie</td>\n",
       "      <td>movie</td>\n",
       "      <td>movie</td>\n",
       "      <td>movie</td>\n",
       "      <td>movie</td>\n",
       "      <td>movie</td>\n",
       "      <td>movie</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>film</td>\n",
       "      <td>film</td>\n",
       "      <td>film</td>\n",
       "      <td>film</td>\n",
       "      <td>film</td>\n",
       "      <td>film</td>\n",
       "      <td>film</td>\n",
       "      <td>film</td>\n",
       "      <td>film</td>\n",
       "      <td>film</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>comedy</td>\n",
       "      <td>characters</td>\n",
       "      <td>bad</td>\n",
       "      <td>plot</td>\n",
       "      <td>story</td>\n",
       "      <td>bad</td>\n",
       "      <td>characters</td>\n",
       "      <td>characters</td>\n",
       "      <td>story</td>\n",
       "      <td>story</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>bad</td>\n",
       "      <td>story</td>\n",
       "      <td>story</td>\n",
       "      <td>bad</td>\n",
       "      <td>time</td>\n",
       "      <td>dull</td>\n",
       "      <td>time</td>\n",
       "      <td>story</td>\n",
       "      <td>comedy</td>\n",
       "      <td>films</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>lame</td>\n",
       "      <td>time</td>\n",
       "      <td>films</td>\n",
       "      <td>comedy</td>\n",
       "      <td>director</td>\n",
       "      <td>story</td>\n",
       "      <td>story</td>\n",
       "      <td>feels</td>\n",
       "      <td>bad</td>\n",
       "      <td>characters</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>dull</td>\n",
       "      <td>films</td>\n",
       "      <td>comedy</td>\n",
       "      <td>movies</td>\n",
       "      <td>movies</td>\n",
       "      <td>action</td>\n",
       "      <td>comedy</td>\n",
       "      <td>comedy</td>\n",
       "      <td>boring</td>\n",
       "      <td>time</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>silly</td>\n",
       "      <td>bad</td>\n",
       "      <td>time</td>\n",
       "      <td>story</td>\n",
       "      <td>bad</td>\n",
       "      <td>comedy</td>\n",
       "      <td>action</td>\n",
       "      <td>love</td>\n",
       "      <td>tale</td>\n",
       "      <td>comedy</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>mess</td>\n",
       "      <td>minutes</td>\n",
       "      <td>characters</td>\n",
       "      <td>time</td>\n",
       "      <td>comedy</td>\n",
       "      <td>thriller</td>\n",
       "      <td>script</td>\n",
       "      <td>script</td>\n",
       "      <td>dull</td>\n",
       "      <td>bad</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8</th>\n",
       "      <td>pretentious</td>\n",
       "      <td>action</td>\n",
       "      <td>movies</td>\n",
       "      <td>characters</td>\n",
       "      <td>characters</td>\n",
       "      <td>characters</td>\n",
       "      <td>films</td>\n",
       "      <td>character</td>\n",
       "      <td>predictable</td>\n",
       "      <td>script</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9</th>\n",
       "      <td>stupid</td>\n",
       "      <td>plot</td>\n",
       "      <td>plot</td>\n",
       "      <td>hard</td>\n",
       "      <td>reason</td>\n",
       "      <td>feels</td>\n",
       "      <td>director</td>\n",
       "      <td>action</td>\n",
       "      <td>movies</td>\n",
       "      <td>action</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "       topic 0     topic 1     topic 2     topic 3     topic 4     topic 5  \\\n",
       "0        movie       movie       movie       movie       movie       movie   \n",
       "1         film        film        film        film        film        film   \n",
       "2       comedy  characters         bad        plot       story         bad   \n",
       "3          bad       story       story         bad        time        dull   \n",
       "4         lame        time       films      comedy    director       story   \n",
       "5         dull       films      comedy      movies      movies      action   \n",
       "6        silly         bad        time       story         bad      comedy   \n",
       "7         mess     minutes  characters        time      comedy    thriller   \n",
       "8  pretentious      action      movies  characters  characters  characters   \n",
       "9       stupid        plot        plot        hard      reason       feels   \n",
       "\n",
       "      topic 6     topic 7      topic 8     topic 9  \n",
       "0       movie       movie        movie       movie  \n",
       "1        film        film         film        film  \n",
       "2  characters  characters        story       story  \n",
       "3        time       story       comedy       films  \n",
       "4       story       feels          bad  characters  \n",
       "5      comedy      comedy       boring        time  \n",
       "6      action        love         tale      comedy  \n",
       "7      script      script         dull         bad  \n",
       "8       films   character  predictable      script  \n",
       "9    director      action       movies      action  "
      ]
     },
     "execution_count": 29,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "print_topics_modelling(\n",
    "    10,\n",
    "    feature_names = np.array(features),\n",
    "    sorting = np.argsort(components)[:, ::-1],\n",
    "    n_words = 10,\n",
    "    return_df = True,\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.8"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
